In [1]:
# import libraries
# ================

# for date and time operations
from datetime import datetime
# for file and folder operations
import os
# for regular expression operations
import re
# for listing files in a folder
import glob
# for getting web contents
import requests
# storing and analysing data
import pandas as pd
# for scraping web contents
from bs4 import BeautifulSoup
In [2]:
# get data
# ========

# URL at which the web data resides
link = 'https://www.mohfw.gov.in/'
# fetch the page; a timeout prevents the notebook from hanging forever
# on a dead or very slow connection
req = requests.get(link, timeout=30)
# fail fast on HTTP errors (4xx/5xx) instead of silently parsing an error page
req.raise_for_status()
# parse web data
soup = BeautifulSoup(req.content, "html.parser")
In [3]:
# locate the target table
# =======================
# the statewise table is the last <thead>/<tbody> pair on the page

# table head: carries the column names / titles
# (calling a soup/Tag object is BeautifulSoup shorthand for find_all)
thead = soup('thead')[-1]

# rows inside the head — normally a single row holding the column names
head = thead('tr')

# table body: one row per state / UT entry
tbody = soup('tbody')[-1]

# all rows inside the body
body = tbody('tr')
In [4]:
# get the table contents
# ======================

# column titles: one list of cell texts per header row
head_rows = [[cell.text for cell in tr.find_all(['th', 'td'])] for tr in head]

# table contents: one list of cell texts per state row
body_rows = [[cell.text for cell in tr.find_all(['th', 'td'])] for tr in body]
In [5]:
# save contents in a dataframe
# ============================

# skip the LAST 6 body rows (the old comment said 3, but the slice removes 6);
# they hold totals / footnotes rather than state data.
# head_rows[0] supplies the column titles.
df_bs = pd.DataFrame(body_rows[:len(body_rows) - 6],
                     columns=head_rows[0])

# drop the serial-number column; it carries no information
df_bs = df_bs.drop('S. No.', axis=1)

# rename the state column to a short, code-friendly name
# (errors="raise" surfaces immediately if the site changes its headers)
df_bs = df_bs.rename(columns={"Name of State / UT": "State"}, errors="raise")
In [6]:
# date-time information
# =====================

# timestamp of this scrape
now = datetime.now()

# add a 'Date' column as a datetime64 value (month/day/year, time-of-day dropped);
# the old code first stored a string column and then re-parsed it — one step suffices
df_bs['Date'] = pd.to_datetime(now.strftime("%m/%d/%Y"), format='%m/%d/%Y')
In [7]:
# remove extra characters from the 'State' column
# ('#' is presumably a footnote marker appended by the source site —
#  NOTE(review): confirm against the live page)
df_bs['State'] = df_bs['State'].str.replace('#', '')
In [8]:
# latitude and longitude information
# ==================================
# NOTE(review): keys must match the scraped 'State' strings exactly; any state
# missing from these dicts gets NaN coordinates after .map().  'Telengana' and
# 'Dadar Nagar Haveli' look misspelled vs. the official names ('Telangana',
# 'Dadra and Nagar Haveli') — verify they match what mohfw.gov.in serves.

# latitude of the states
lat = {'Delhi':28.7041, 'Haryana':29.0588, 'Kerala':10.8505, 'Rajasthan':27.0238,
       'Telengana':18.1124, 'Uttar Pradesh':26.8467, 'Ladakh':34.2996, 'Tamil Nadu':11.1271,
       'Jammu and Kashmir':33.7782, 'Punjab':31.1471, 'Karnataka':15.3173, 'Maharashtra':19.7515,
       'Andhra Pradesh':15.9129, 'Odisha':20.9517, 'Uttarakhand':30.0668, 'West Bengal':22.9868, 
       'Puducherry': 11.9416, 'Chandigarh': 30.7333, 'Chhattisgarh':21.2787, 'Gujarat': 22.2587, 
       'Himachal Pradesh': 31.1048, 'Madhya Pradesh': 22.9734, 'Bihar': 25.0961, 'Manipur':24.6637, 
       'Mizoram':23.1645, 'Goa': 15.2993, 'Andaman and Nicobar Islands': 11.7401, 'Assam' : 26.2006, 
       'Jharkhand': 23.6102, 'Arunachal Pradesh': 28.2180, 'Tripura': 23.9408, 'Nagaland': 26.1584, 
       'Meghalaya' : 25.4670, 'Dadar Nagar Haveli' : 20.1809, 'Sikkim':27.5330}

# longitude of the states
long = {'Delhi':77.1025, 'Haryana':76.0856, 'Kerala':76.2711, 'Rajasthan':74.2179,
        'Telengana':79.0193, 'Uttar Pradesh':80.9462, 'Ladakh':78.2932, 'Tamil Nadu':78.6569,
        'Jammu and Kashmir':76.5762, 'Punjab':75.3412, 'Karnataka':75.7139, 'Maharashtra':75.7139,
        'Andhra Pradesh':79.7400, 'Odisha':85.0985, 'Uttarakhand':79.0193, 'West Bengal':87.8550, 
        'Puducherry': 79.8083, 'Chandigarh': 76.7794, 'Chhattisgarh':81.8661, 'Gujarat': 71.1924, 
        'Himachal Pradesh': 77.1734, 'Madhya Pradesh': 78.6569, 'Bihar': 85.3131, 'Manipur':93.9063, 
        'Mizoram':92.9376, 'Goa': 74.1240, 'Andaman and Nicobar Islands': 92.6586, 'Assam' : 92.9376, 
        'Jharkhand': 85.2799, 'Arunachal Pradesh': 94.7278, 'Tripura': 91.9882, 'Nagaland': 94.5624,
        'Meghalaya' : 91.3662, 'Dadar Nagar Haveli' : 73.0169, 'Sikkim':88.5122}

# add latitude column keyed on the 'State' column
df_bs['Latitude'] = df_bs['State'].map(lat)

# add longitude column keyed on the 'State' column
df_bs['Longitude'] = df_bs['State'].map(long)
In [9]:
# read data about the number of hospitals and beds per state
# (public GitHub mirror of a similar dataset, kept for reference)
##df1 = pd.read_csv('https://raw.githubusercontent.com/souroy12/Hospital-Bed-Analysis/master/Number%20of%20Government%20Hospitals%20and%20Beds%20in%20Rural%20and%20Urban%20Areas.csv')

# NOTE(review): hardcoded absolute local path — not portable; point this at a
# repo-relative location before sharing the notebook
HOSPITAL_BEDS_CSV = 'C://Users/skoul2/AnacondaProjects/Covid/datasets_Number of Hospitals and Beds in Public and Private Areas .csv'
df1 = pd.read_csv(HOSPITAL_BEDS_CSV)
In [10]:
# left-join the hospital/bed data onto the scraped frame; with no `on=` given,
# pandas merges on all column names the two frames share — presumably just
# 'State'.  NOTE(review): confirm df1's columns; an unexpected shared column
# would silently change the join keys.
combined_df=pd.merge(df_bs, df1, how='left')
In [11]:
# drop the row with index label 17
# NOTE(review): magic index — which state this removes depends entirely on the
# row order produced by the scrape; filtering by 'State' name would be safer
combined_df = combined_df.drop(17)
In [12]:
# normalise the scraped column names in a single pass
# (errors="raise" flags immediately if the site changes its headers)
combined_df = combined_df.rename(columns={
    "Deaths**": "Deaths",
    "Total Confirmed cases*": "Confirmed",
    "Cured/Discharged/Migrated*": "Cured_Discharged_Migrated",
    "Active Cases*": "Active",
}, errors="raise")
In [13]:
# convert the count columns from scraped strings to plain ints
count_cols = ['Active', 'Deaths', 'Confirmed', 'Cured_Discharged_Migrated']
combined_df[count_cols] = combined_df[count_cols].astype(int)
In [14]:
class color:
    """ANSI escape sequences for coloured / styled terminal output."""
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'  # resets all styling

# Concatenate the ANSI codes with the message: the old call passed them as
# separate print() arguments, which inserted stray spaces before the text.
print(color.BOLD + color.RED + color.UNDERLINE + 'Current Situation in India according to www.mohfw.gov.in and this code will pull the latest data from this site through Web scraping' + color.END)
  Current Situation in India according www.mohfw.gov.in and this code will pull the latest data from this site through Web scraping
In [15]:
# section banner for the 3-D visualisation below
print('3D Statewise view for Cured/Discharged/Migrated, Deaths and Confirmed')
3D Statewise view for Cured/Discharged/Migrated, Deaths and Confirmed
In [16]:
import plotly.express as px
import numpy as np

# 3-D scatter: one bubble per state, bubble size proportional to confirmed cases
fig = px.scatter_3d(combined_df,
                    x='Cured_Discharged_Migrated', y='Active', z='Deaths',
                    size='Confirmed', color='State')
# log scale on all three axes so low-count states remain visible
fig.update_layout(height=800, width=800,
                  scene_xaxis_type="log", scene_yaxis_type="log", scene_zaxis_type="log")
fig.show()
In [17]:
import folium

# base map centred on India
indiacovid = folium.Map(location=[20.5937, 78.9629], zoom_start=5.4)

# One circle marker per state.  Fixes vs. the original:
# - the popup was accidentally a 2-tuple (stray comma after the first string);
#   it is now a single concatenated HTML string
# - the loop variables no longer shadow the module-level `lat` dict
# - the dead `folium` expression inside the loop was removed
rows = zip(combined_df['Latitude'], combined_df['Longitude'],
           combined_df['State'], combined_df['Deaths'], combined_df['Confirmed'])
for lat_i, lon_i, state, deaths, confirmed in rows:
    popup_html = ('State:' + str(state) + '<br>'
                  'Total Confirmed cases:' + str(confirmed) + '<br>'
                  'Death :' + str(deaths) + '<br>')
    folium.CircleMarker([lat_i, lon_i],
                        radius=5,
                        color='Green',
                        popup=popup_html,
                        fill_color='Yellow',
                        fill_opacity=0.7).add_to(indiacovid)
In [18]:
# section banner for the hospital-capacity map below
print('Geographical & holistic view of  patients and beds availability across hospitals')
Geographical & holistic view of  patients and beds availability across hospitals
In [19]:
##import folium
###indiacovid = folium.Map(location=[20.5937,78.9629], zoom_start=5.4)
# Overlay one circle per state on the map built in the earlier cell; the tooltip
# lists hospital/bed capacity alongside case counts, and the radius grows
# super-linearly with confirmed cases (Confirmed ** 1.1).
# NOTE(review): column names such as 'beds_ public_sector' contain embedded
# spaces — presumably copied verbatim from the CSV header; confirm against df1.
for i in range(0, len(combined_df)):
   folium.Circle(
        location=[combined_df.iloc[i]['Latitude'], combined_df.iloc[i]['Longitude']],
        color='crimson', 
        tooltip =   '<li><bold>State : '+str(combined_df.iloc[i]['State'])+
                    '<li><bold>Total Beds available : '+str(combined_df.iloc[i]['total_beds'])+
                    '<li><bold>Total Beds available in private hospital : '+str(combined_df.iloc[i]['beds_private_sector'])+
                    '<li><bold>Total Beds available in Public hospital : '+str(combined_df.iloc[i]['beds_ public_sector'])+
                    '<li><bold>Total hospitals : '+str(combined_df.iloc[i]['total_hospitals'])+
                    '<li><bold>Total Public hospitals : '+str(combined_df.iloc[i]['hospitals_ public_sector'])+
                    '<li><bold>Total Private hospitals : '+str(combined_df.iloc[i]['hospitals_ private_sector'])+
                    '<li><bold>Active cases : '+str(combined_df.iloc[i]['Active'])+
                    '<li><bold>Deaths : '+str(combined_df.iloc[i]['Deaths'])+
                    '<li><bold>Cured_Discharged_Migrated : '+str(combined_df.iloc[i]['Cured_Discharged_Migrated'])+
                    '<li><bold>Total Confirmed : '+str(combined_df.iloc[i]['Confirmed']),
        radius=int(combined_df.iloc[i]['Confirmed'])**1.1).add_to(indiacovid)
# last expression: render the map inline
indiacovid
Out[19]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [20]:
# section banner for the per-state bar chart below
print('Total Deaths,Active,Cured/Discharged/Migrated  as per each state')
Total Deaths,Active,Cured/Discharged/Migrated  as per each state
In [21]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(10,10))
plt.barh(combined_df['State'],combined_df['Deaths'], label = 'Deaths')
plt.barh(combined_df['State'],combined_df['Active'], label = 'Total Active')
plt.barh(combined_df['State'],combined_df['Cured_Discharged_Migrated'], label = ' Total Cured/Discharged/Migrated')
plt.ylabel('States')
plt.xlabel('Confirmed')
plt.legend()
plt.show()
In [22]:
###Features. Select features
##Dates. Filter train data from 2020-03-01 to 2020-03-18
##Log transformation. Apply log transformation to ConfirmedCases and Fatalities
##Infinites. Replace infinites from the logarithm with 0. Given the asymptotic behavior of the logarithm for log(0),this implies that when applying the inverse transformation (exponential) a 1 will be returned instead of a 0. This problem does not impact many countries, but still needs to be tackled sooner or later in order to obtain a clean solution.
###Train/test split. Split into train/valid/test
###Prediction. Linear Regression, training country by country and joining data
###Submit. Submit results in the correct format, and applying exponential to reverse log transformation
In [23]:
# work on an independent copy so downstream ML transforms cannot silently
# mutate the scraped/merged frame (plain assignment would only alias it)
combined_df_ml = combined_df.copy()
In [24]:
# Overall split of cases nationwide: Active vs Cured vs Deaths
import plotly.graph_objs as go

active_total = combined_df['Active'].sum()
cured_total = combined_df['Cured_Discharged_Migrated'].sum()
death_total = combined_df['Deaths'].sum()

# donut chart (hole=.3) of the three totals
fig = go.Figure(data=[go.Pie(labels=['Active','Cured','Death'],
                             values=[active_total, cured_total, death_total], hole=.3)])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=['#263fa3', '#2fcc41','#cc3c2f'], line=dict(color='#FFFFFF', width=2)))
fig.update_layout(title_text='Total overall cases',plot_bgcolor='rgb(275, 270, 273)')
fig.show()
In [25]:
fig = plt.figure(figsize=(10, 10))

# confirmed cases per state, largest first
conf_per_country = combined_df.groupby('State')['Confirmed'].sum().sort_values(ascending=False)
conf_sum = combined_df['Confirmed'].sum()  # NOTE(review): computed but unused below

def absolute_value(val):
    """Round the percentage matplotlib's autopct passes in to 2 decimal places."""
    return np.round(val, 2)

conf_per_country.plot(kind="pie", title='Percentage of confirmed cases per state', autopct=absolute_value)
plt.show()
In [26]:
# per-state totals of the three case-count columns
cols = ['Cured_Discharged_Migrated', 'Deaths', 'Confirmed']
df2 = combined_df.groupby('State')[cols].sum()
In [27]:
# top 20 states by confirmed cases, with deaths and recoveries alongside
df2 = df2.nlargest(20, 'Confirmed')

fig20, ax20 = plt.subplots(figsize=(20, 10))
ax20.set_title('top 20 states with confirmed cases', fontsize=30)
ax20.tick_params(axis='x', rotation=90, labelsize=20)
ax20.tick_params(axis='y', labelsize=20)
ax20.set_xlabel('State', fontsize=20)
ax20.set_ylabel('Cases', fontsize=20)
ax20.plot(df2.index, df2.Confirmed, marker='o', mfc='black', label='Confirmed', markersize=10, linewidth=5)
ax20.plot(df2.index, df2.Deaths, marker='o', mfc='black', label='Deaths', markersize=10, linewidth=5)
ax20.plot(df2.index, df2.Cured_Discharged_Migrated, marker='o', mfc='black', label='Cured_Discharged_Migrated', markersize=10, linewidth=5, color='green')
ax20.legend(fontsize=20)
Out[27]:
<matplotlib.legend.Legend at 0x179c1236438>
In [28]:
# read the daily statewise COVID-19 time series used by the ML section below.
# (The previous comments here were copy-pasted from the hospital-beds cell and
# described the wrong dataset.)
# NOTE(review): this OVERWRITES combined_df_ml (previously a copy of the merged
# frame) with a different dataset, and the hardcoded absolute path is not portable.
COVID_TS_CSV = 'C://Users/skoul2/AnacondaProjects/Covid/datasets_covid_19_india.csv'
combined_df_ml = pd.read_csv(COVID_TS_CSV)
In [29]:
from scipy import optimize

# Inputs for the logistic fit: x is the day index of each row (days since the
# first record), y is the confirmed-case series we fit the curve to.
x_data = range(combined_df_ml.shape[0])
y_data = combined_df_ml['Confirmed']

def log_curve(x, k, x_0, ymax):
    """Logistic (sigmoid) curve: ymax / (1 + e^(-k*(x - x_0))).

    k is the growth rate, x_0 the midpoint (inflection day) and ymax the
    plateau value.  Accepts scalars or array-likes via numpy broadcasting.
    """
    return ymax / (1.0 + np.exp(k * (x_0 - x)))

# Fit the logistic curve to the observed series.  All three parameters are
# bounded below by 0; a generous maxfev because convergence can be slow here.
popt, pcov = optimize.curve_fit(log_curve, x_data, y_data,
                                bounds=([0, 0, 0], np.inf), maxfev=50000)
k, x_0, ymax = popt

# evaluate the fitted curve over a 160-day horizon for plotting
y_fitted = log_curve(range(0, 160), k, x_0, ymax)
print(k, x_0, ymax)
0.002069650317466136 2562.0641623758966 10978.633857826273
In [30]:
# section banner for the fitted-curve plot below
print('Growthfactor for confirmed cases')
Growthfactor for confirmed cases
In [31]:
# Overlay the fitted logistic curve (dashed) on the raw confirmed counts
fig, ax = plt.subplots()
ax.plot(range(0, 160), y_fitted, '--', label='fitted')
ax.plot(x_data, y_data, 'o', label='Confirmed Data')
Out[31]:
[<matplotlib.lines.Line2D at 0x179c2250438>]
In [32]:
from sklearn import preprocessing

# encode state names as integer labels so the column is usable as an ML feature
state_encoder = preprocessing.LabelEncoder()
combined_df_ml['State/UnionTerritory'] = state_encoder.fit_transform(combined_df_ml['State/UnionTerritory'])
In [33]:
# model-building imports
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# NOTE(review): the next line shadows the `datetime` MODULE imported here with
# the `datetime` CLASS — after this, `datetime.datetime` is no longer valid
import datetime
from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn import metrics
In [34]:
# feature: deaths; target: confirmed cases.  80/20 split, fixed seed.
x = combined_df_ml[['Deaths']]
y = combined_df_ml['Confirmed']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
In [35]:
# Fit a simple linear regression (Confirmed ~ Deaths) on the training split.
# LinearRegression is already imported in the ML-imports cell above, so the
# duplicate in-cell import was dropped.
regressor = LinearRegression()
regressor.fit(x_train, y_train)
Out[35]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [36]:
# Predicting the Test set results
# predict confirmed cases for the held-out test split
y_pred = regressor.predict(x_test)
In [37]:
# Measured vs predicted scatter; the dashed line y = x marks perfect prediction
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured Covid Cases ')
ax.set_ylabel('Predicted Covid Cases')
plt.title('Measured Vs Predicted Covid-19 cases')
plt.show()
# fixed user-facing typo: 'dealth' -> 'death'
print('Predicted Covid-19 cases per death')
Predicted Covid-19 cases per dealth